library(beeswarm)
library(naniar)
library(zoo)
#> Attaching package: ‘zoo’
#> The following objects are masked from ‘package:base’:
#> as.Date, as.Date.numeric
# install.packages("zoo")
library(janitor)
library(dplyr)
# install.packages("GGally")
# library(sets)
library(tidyverse)
library(ggplot2)
library(GGally) # for ggpairs
# install.packages("maps")
# library(maps)
#' Read a CSV file into a tibble.
#'
#' Thin wrapper around `readr::read_csv()`.
#'
#' @param file_path Path of the CSV file to read.
#' @param ... Further arguments forwarded unchanged to `readr::read_csv()`,
#'   e.g. `col_types` or `guess_max`, so callers can override column-type
#'   guessing when the guessed type does not fit later rows.
#' @return The tibble produced by `readr::read_csv()`.
load_file <- function(file_path, ...) {
  readr::read_csv(file_path, ...)
}
# Load the per-county, per-date COVID-19 case and death counts from
# COVID-19_cases_TX.csv (presumably the Texas extract; confirm with the data
# source).
tx_data <- load_file("./../data/COVID-19_cases_TX.csv")
#> ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
#> cols(
#> county_fips_code = col_character(),
#> county_name = col_character(),
#> state = col_character(),
#> state_fips_code = col_double(),
#> date = col_date(format = ""),
#> confirmed_cases = col_double(),
#> deaths = col_double()
#> )
# Load the Google mobility report.
# readr guesses column types from the first rows only; sub_region_2,
# metro_area and census_fips_code are empty there, so they were guessed as
# logical and every later text value (e.g. "Kabul Metropolitan Area") became a
# parsing failure and was replaced by NA. Reading them as character keeps the
# values; all other columns are still guessed.
global_mobility_report <- readr::read_csv(
  "./../data/Global_Mobility_Report.csv",
  col_types = readr::cols(
    sub_region_2 = readr::col_character(),
    metro_area = readr::col_character(),
    census_fips_code = readr::col_character()
  )
)
#> ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
#> cols(
#> country_region_code = col_character(),
#> country_region = col_character(),
#> sub_region_1 = col_character(),
#> sub_region_2 = col_logical(),
#> metro_area = col_logical(),
#> iso_3166_2_code = col_character(),
#> census_fips_code = col_logical(),
#> date = col_date(format = ""),
#> retail_and_recreation_percent_change_from_baseline = col_double(),
#> grocery_and_pharmacy_percent_change_from_baseline = col_double(),
#> parks_percent_change_from_baseline = col_double(),
#> transit_stations_percent_change_from_baseline = col_double(),
#> workplaces_percent_change_from_baseline = col_double(),
#> residential_percent_change_from_baseline = col_double()
#> )
#> 4199216 parsing failures.
#> row col expected actual file
#> 3036 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
#> 3037 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
#> 3038 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
#> 3039 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
#> 3040 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
#> .... .......... .................. ....................... ......................................
#> See problems(...) for more details.
# Load the combined COVID-19 cases + census table.
# NOTE(review): readr guessed several population/language columns (e.g.
# pop_5_years_over, speak_only_english_at_home, pop_never_married) as logical;
# presumably they are empty in the rows used for type guessing. Confirm before
# relying on those columns.
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")
#> ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
#> cols(
#> .default = col_double(),
#> county_fips_code = col_character(),
#> county_name = col_character(),
#> state = col_character(),
#> state_fips_code = col_character(),
#> date = col_date(format = ""),
#> geo_id = col_character(),
#> pop_5_years_over = col_logical(),
#> speak_only_english_at_home = col_logical(),
#> speak_spanish_at_home = col_logical(),
#> speak_spanish_at_home_low_english = col_logical(),
#> pop_15_and_over = col_logical(),
#> pop_never_married = col_logical(),
#> pop_now_married = col_logical(),
#> pop_separated = col_logical(),
#> pop_widowed = col_logical(),
#> pop_divorced = col_logical()
#> )
#> ℹ Use `spec()` for the full column specifications.
# Keep only the census/case columns used in the county-level analysis.
cols_keep <- c(
  "county_fips_code", "confirmed_cases", "deaths", "median_income",
  "male_pop", "female_pop", "total_pop", "median_age", "worked_at_home"
)
subset_census <- cases_plus_census[cols_keep]

# Keep the date and the six "percent change from baseline" mobility columns.
# cols_keep is reassigned here, so after this point it holds the mobility
# column names.
cols_keep <- c(
  "date",
  "retail_and_recreation_percent_change_from_baseline",
  "grocery_and_pharmacy_percent_change_from_baseline",
  "parks_percent_change_from_baseline",
  "transit_stations_percent_change_from_baseline",
  "workplaces_percent_change_from_baseline",
  "residential_percent_change_from_baseline"
)
subset_mobility <- global_mobility_report[cols_keep]
subset_mobility$date <- as.Date(subset_mobility$date, format = "%Y-%m-%d")

# Missing-value overview of each data set, columns sorted by missingness.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
# global_mobility_report
vis_miss(global_mobility_report, sort_miss = TRUE, warn_large_data = FALSE)

vis_miss(tx_data, sort_miss = TRUE, warn_large_data = FALSE)

vis_miss(subset_census, sort_miss = TRUE, warn_large_data = FALSE)

library(RColorBrewer)
#' Draw a US county choropleth for one column of a data frame.
#'
#' County polygons from `ggplot2::map_data("county")` are matched to FIPS
#' codes through `maps::county.fips` and then joined to `df`. Counties without
#' a matching row in `df` get `NA` fill values.
#'
#' @param df Data frame containing the FIPS column and the column to plot.
#' @param col_val Name (string) of the column of `df` mapped to the fill.
#' @param percentile If `FALSE` (default) the raw values of `col_val` are
#'   shown on a continuous gradient; if `TRUE` the values are converted to
#'   percentile classes and shown with a discrete red palette.
#' @param fips_title Name (string) of the FIPS column of `df`. It is joined
#'   against `maps::county.fips$fips`, so its type must be compatible with
#'   that column.
#' @param banks Number of break points between 0 and 100 used for the
#'   percentile classes (giving `banks - 1` classes); also the number of
#'   palette colours generated.
#' @param legend_title Title of the fill legend; `col_val` is used when this
#'   is the empty string.
#' @param graphic_title Plot title.
#' @return A ggplot object.
plot_vs_county <- function(df, col_val, percentile = FALSE,
                           fips_title = "county_fips_code", banks = 6,
                           legend_title = "", graphic_title = "") {
  # Subset for speed
  df <- df[c(fips_title, col_val)]

  # Polygon data: counties for the fill, states for the outline
  gcounty <- ggplot2::map_data("county")
  gusa <- ggplot2::map_data("state")

  # Interpolate the 9-colour "Reds" palette to `banks` colours. This is done
  # for every value of `banks` so the percentile plot also works with the
  # default (and any other) number of classes.
  mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(banks)

  # FIPS lookup keyed by region/subregion; the ":..." suffix that some
  # polynames carry is stripped before splitting on the comma.
  fipstab <- maps::county.fips %>%
    transmute(fips, county = sub(":.*", "", polyname)) %>%
    unique() %>%
    separate(county, c("region", "subregion"), sep = ",")

  # Attach FIPS codes to the polygons (NA where the lookup has no entry)
  gcounty <- left_join(gcounty, fipstab, by = c("region", "subregion"))

  # Rank and percentile class of the plotted column. The column is extracted
  # as a plain vector with [[ ]] so the ranking functions receive a vector
  # rather than a one-column data frame.
  values <- df[[col_val]]
  dis <- df
  dis$rprop <- rank(values)
  dis$pcls <- cut(100 * percent_rank(values),
                  seq(0, 100, length.out = banks),
                  include.lowest = TRUE)

  gcounty_pop <- left_join(gcounty, dis,
                           by = stats::setNames(fips_title, "fips"))

  if (legend_title == "") {
    legend_title <- col_val
  }

  # State borders drawn on top of the county fill in both variants
  state_outline <- geom_polygon(aes(long, lat, group = group),
                                fill = NA, data = gusa, color = "lightgrey")

  if (percentile) {
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = pcls),
                   color = "grey", size = 0.1) +
      state_outline +
      coord_map("bonne", parameters = 41.6) +
      ggthemes::theme_map() +
      scale_fill_manual(values = mycolors, na.value = "grey") +
      theme(plot.title = element_text(family = "Helvetica", face = "bold",
                                      size = 15),
            legend.background = element_rect(fill = NA),
            legend.position = "left")
  } else {
    # .data[[col_val]] looks the column up by its string name inside aes()
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = .data[[col_val]]),
                   color = "grey", size = 0.1) +
      state_outline +
      coord_map("bonne", parameters = 41.6) +
      ggthemes::theme_map() +
      scale_fill_gradient2()
  }

  plt + labs(fill = legend_title) + ggtitle(graphic_title)
}
# Show the table before adding the derived columns.
subset_census

# Cases and deaths as a fraction of each county's total population
# (values are proportions, not multiplied by 100).
subset_census[["pct_infected"]] <-
  subset_census[["confirmed_cases"]] / subset_census[["total_pop"]]
subset_census[["pct_deaths"]] <-
  subset_census[["deaths"]] / subset_census[["total_pop"]]

# FIPS codes are read as character; convert to integer so they can be joined
# with the integer FIPS codes used by the county map lookup.
subset_census[["county_fips_code"]] <-
  as.integer(subset_census[["county_fips_code"]])

# Show the table again with the new columns.
subset_census
# County map of pct_infected on a continuous fill scale (percentile is left at
# its default, FALSE).
plot_vs_county(subset_census, "pct_infected", legend_title = "Percent Infected")
#> Ignoring unknown parameters: name

# Percentile-class map of the infected share per county.
plot_vs_county(
  subset_census,
  "pct_infected",
  percentile = TRUE,
  banks = 11,
  legend_title = "Percentile Infected",
  graphic_title = "Percentile of Percentage of People Infected by County"
)

# Percentile-class map of the death share per county.
plot_vs_county(
  subset_census,
  "pct_deaths",
  percentile = TRUE,
  banks = 11,
  legend_title = "Percentile Deaths",
  graphic_title = "Percentile of Percentage of Deaths by County"
)

# Columns included in the correlation matrix.
census_corr_cols <- c(
  "deaths",
  "confirmed_cases",
  "median_income",
  "male_pop",
  "female_pop",
  "total_pop",
  "median_age",
  "worked_at_home"
)

# Labelled correlation heat map of the selected columns.
ggcorr(
  subset_census[census_corr_cols],
  low = "red",
  mid = "grey",
  high = "blue",
  hjust = .75,
  size = 3,
  label = TRUE,
  label_size = 3,
  label_color = "white"
) +
  ggplot2::labs(title = "Pearson Correlation of Important Variables")

# Country code, the date column(s) and every "percent change" column of the
# mobility report, in that order.
country_date_pct_change <- select(
  global_mobility_report,
  country_region_code,
  contains("date"),
  contains("percent")
)
country_date_pct_change